

### Basic Setup Code
  # TODO: move to its own file
for(p in c("stringr","lfe",  "readstata13",  "stargazer",  "ggplot2",  
           "survival",  "ggpubr",  "permute",  "tidyr",  "Hmisc","dplyr",
           "pastecs","binsreg",
           "lmtest","sandwich","utils","estimatr","grid","gridExtra")){
  base::suppressPackageStartupMessages(library(p,character.only=TRUE))    
}

cbsa.observables.reported = c("pop2010","static_amenities.wharton_land_use_index","static_amenities.cooling_days","static_amenities.heating_days","static_amenities.sunshine_pct","static_amenities.inv_dist_from_water","static_amenities.latitude","static_albouy.quality_of_life","static_albouy.total_amenity_value","tax.average_tax_rate_50th_perc","tax.average_tax_rate_95th_perc","tax.average_tax_rate_99th_perc","tax.average_tax_rate_999th_perc","bohemia","hhi","average_rent","average_home_value","average_income","average_wage","some_college","unemployment_rate","venture_capital.num_of_vc_firms","venture_capital.total_raised","venture_capital.total_invested","venture_capital.num_startups_financed","patents_granted_2000_to_2015")


#data.bad_cbsa = c(46540,14020,43780,48620,26580,12580,49660,43620,49620,44300,38300,25420,10900,20100, 48620, 49620,  44940,  43780,  19260, 24220, 33460,10090,10580, 10740, 11100, 99932, 17420, 17780,	99927	)


data.bad_states <- c()

bad.cbsas = c(10900,  # Allentown-Bethlehem-Easton, PA-NJ
              41760,  # Sandpoint, ID
              #47900, # Washington-Arlington-Alexandria, DC-VA-MD-WV
              12580,  # Baltimore-Columbia-Towson, MD
              45940,  # Trenton-Princeton, NJ
              12300,  # Augusta-Waterville, ME
              34820   #  Myrtle Beach-Conway-North Myrtle Beach, SC-NC 
)

data.bad_cbsa = bad.cbsas

get_state_pop <- function() {
  return(
    read.dta13("./data/statepopdata.dta") %>%
      rename("state"="deststate")
  )
}









estimate_true_migration_rate <- function(state_flows, total_firms,  movevar) {
  
  names(state_flows)
  
  
  #movevar = "moves"
  state_to_state <- state_flows %>%
    mutate(sourcestate = as.character(sourcestate),
           deststate = as.character(deststate)) %>%
    select(sourcestate ,deststate, !!movevar) %>%
    group_by(sourcestate,deststate) %>%
    summarise(move5 = sum(!!sym(movevar)))
  
  
  state_to_state <- data.frame(state_to_state)
  
  state_pop <- get_state_pop()
  
  all_states <- data.frame(expand.grid(state_pop$state, state_pop$state))
  colnames(all_states) <- c("sourcestate","deststate")
  
  all_states <- all_states %>%
    inner_join(state_pop,by=c("sourcestate" ="state")) %>%
    rename("source.pop2010" = "pop2010", "source.pop1990"="pop1990","source.popgrowth" = "popgrowth19902010") %>%
    inner_join(state_pop,by=c("deststate" ="state")) %>%
    rename("dest.pop2010" = "pop2010", "dest.pop1990"="pop1990","dest.popgrowth" = "popgrowth19902010") 
  
  
  state_to_state <- state_to_state %>%
    inner_join(all_states, by=c("sourcestate","deststate")) %>%
    filter(sourcestate != deststate) %>%
    filter(!(sourcestate %in% c("AR","DE","IA","MI","MT","NE","NV","ND","OK","PA","SC","SD","WV","WI","WY","DC","AK","MD"))) 
    
  
  
  
  not_matched <- all_states %>%
    anti_join(state_to_state , by=c("sourcestate","deststate")) %>%
    filter(sourcestate %in% state_to_state$sourcestate) %>%
    filter(!(deststate %in% state_to_state$sourcestate)) %>%
    filter(sourcestate != deststate) %>%
    arrange(sourcestate, deststate)
  
  
  
  reg <- glm(move5 ~  log(source.pop2010) + log(dest.pop2010)  + log(source.pop1990) + log(dest.pop1990) + dest.popgrowth  ,
             data=state_to_state %>% filter(!is.na(move5)), family="poisson")
  
  summary(reg)
  
  
  pred <- predict(reg, not_matched, type="response")
  
  
  in_data_migrations <- sum(state_to_state$move5)
  pred_migrations <- sum(pred)
  tot_migrations <- (in_data_migrations + pred_migrations)
  
  print(paste0("total_number of firms: ", total_firms))
  migration_rate <- (in_data_migrations + pred_migrations) / total_firms
  print(paste0("The raw number of migrations is ",sum(state_to_state$move5, na.rm=TRUE),
               ". Predicted new migrations is ",pred_migrations))
  print(paste0("  The original migration rate is ", in_data_migrations/total_firms, ", The true migration rate is ", migration_rate))
  
  return (migration_rate)
}





## Fuctions
bg.fx.load_migration_data_by_firm <- function(only_if_missing = FALSE, corp = TRUE) {
  files.migration_data = "./data/migration.bryan_guzman.prepared.dta"
  files.stata_folder = "./data/"
  
  
  if(only_if_missing & ("migration_data" %in% ls(.GlobalEnv))){
    print("migration_data already exists.  Returning existing object instead of reading file again.")
    return(get("migration_data", envir=.GlobalEnv))
  } else {
    migration_data <<- read.dta13(files.migration_data) 
    migration_data <<- get("migration_data", envir=.GlobalEnv) %>% 
      filter(!(cbsa %in% data.bad_cbsa)) %>%
            mutate(
                cbsa = ifelse(cbsa == 31100,31080,cbsa)
            )
    
    
    if(corp) {
      return(migration_data %>% filter(is_corp == 1))
    } else {
      return(migration_data %>% filter(is_corp == 0))
    }
  }
}




bg.fx.load_migration_data_by_cbsa <- function(include_llcs=FALSE, filepath = NULL) {
  
  #setwd("C:\\Users\\jag2367\\Dropbox (MIT)\\Bryan_Guzman\\Bryan_Guzman_Migration\\Official\\R")
  
  print("bg.fx.load_migration_data_by_cbsa ")
  if(!is.null(filepath)){
    cbsa_to_cbsa = read.csv(file=filepath, header=TRUE)[-1,-2]
  }else if(!include_llcs){
    cbsa_to_cbsa = read.csv(file="./data/cbsa_matrix.csv", header=TRUE)[-1,-2]
  } else {
    cbsa_to_cbsa = read.csv(file="./data/cbsa_matrix_llc.csv", header=TRUE)[-1,-2]
  }
    
  colnames(cbsa_to_cbsa)[1] <- "cbsacode"
  
  cbsa_to_cbsa <- cbsa_to_cbsa %>% 
    gather(key="dest.cbsa",value="move5", colnames(cbsa_to_cbsa[,-1])) %>%
    mutate(dest.cbsa = as.numeric(gsub("\\.\\d","",gsub("X","",dest.cbsa))),
           source.cbsa=cbsacode,
           moveq = 1,
           move5=as.numeric(move5)) 
  
  cbsa_to_cbsa <- cbsa_to_cbsa %>%
    group_by(source.cbsa, dest.cbsa) %>%
    summarise(
      move5= sum(move5),
      moveq = mean(moveq),
      move5recpi = sum(move5)
    )
  
  cbsa_to_cbsa <- data.frame(cbsa_to_cbsa)
  
  
  
  # cbsa_to_cbsa <- cbsa_to_cbsa %>% 
  #   rename(source.cbsa = sourcemsa, dest.cbsa = destmsa) %>%
  #   mutate(move5 = replace_na(move5,0),
  #          move5recpi = move5 * replace_na(moveq,0),
  #          move5.nonzero = ifelse(move5 == 0, NA, move5)) 
  

  #%>%  filter(!(source.cbsa %in% data.bad_cbsa) & !(dest.cbsa %in% data.bad_cbsa))
  
  return(cbsa_to_cbsa)
}



get_agg_cbsa <- function(refresh=FALSE, include_llcs=FALSE) {
  
  remove.cbsas <- c(42140,37980,19820)
  
  
  
  cbsa_to_cbsa <- bg.fx.load_migration_data_by_cbsa(include_llcs)%>%
    dplyr::group_by(source.cbsa,dest.cbsa) %>%
    dplyr::summarise(
      move5 = sum(replace_na(move5,0)),
      moveq = mean(moveq,  na.rm=TRUE),
      move5recpi = sum(move5 * replace_na(moveq,0), na.rm=TRUE),
      move5.nonzero = ifelse(move5 == 0, NA,move5)
    )%>%
    dplyr::select(source.cbsa, dest.cbsa, move5, moveq, move5recpi, move5.nonzero) %>%
    dplyr::filter(!(source.cbsa %in% remove.cbsas) & !(dest.cbsa %in% remove.cbsas))
  
  
  
  return(data.frame(cbsa_to_cbsa))
}




get_moves_in_out_from_migration_cbsa <- function(migration.cbsa){
  
  movesin <- migration.cbsa  %>% group_by(source.cbsa) %>% summarise(moves.out=sum(move5)) %>%
    rename("cbsa"="source.cbsa")
  
  movesout <- migration.cbsa  %>% group_by(dest.cbsa) %>% summarise(moves.in=sum(move5)) %>% 
    rename("cbsa"="dest.cbsa")
  
  moves.net <- full_join(movesin,movesout, by=c("cbsa")) %>%
    mutate(moves.ratio = moves.out/moves.in,
           moves.out = replace_na(moves.out,0),
           moves.in = replace_na(moves.in, 0),
           moves.total = moves.in + moves.out) 
  
  
  return (moves.net)
}




bg.fx.load_migration_data_by_state <- function() {
  state_to_state <- read.dta13("./data/state_to_state.long.dta")
  
  state_to_state <- state_to_state %>% 
    filter(sourcestate!= "" & deststate != "" & sourcestate!=deststate)  %>%
    rename(move5.corp = move5_corp) %>%
    mutate(move5 = replace_na(move5,0),
           move5.corp = replace_na(move5.corp,0),
           move5.llc = move5 - move5.corp,  
           move5.recpi = move5.corp * moveq
    )
  
  
  state_to_state <- state_to_state %>% 
    group_by(sourcestate, deststate) %>%
    summarise(
      move5.corp = sum(move5.corp),
      move5.llc = sum(move5.llc),
      move5 = sum(move5),
      move5.recpi = sum(move5.recpi)
    ) %>% mutate(
    moveq = move5.recpi/move5.corp,
    move5.corp.nonzero = ifelse(move5.corp <= 0, NA, move5.corp),
    move5.llc.nonzero = ifelse(move5.llc <= 0, NA, move5.llc)
  )
  
  
  
  
  return(state_to_state)
}






get_agg_state <- function() {
  state_to_state <- bg.fx.load_migration_data_by_state() %>%
    group_by(sourcestate,deststate) %>%
    summarise(
      move5 = sum(replace_na(move5,0)) ,
      moveq = mean(moveq, na.rm=TRUE),
      move5recpi = sum(move5 * replace_na(moveq,0)),
      move5.nonzero = ifelse(move5 == 0, NA, move5)
    ) %>%
    select(sourcestate, deststate, move5, moveq, move5recpi, move5.nonzero) %>%
    mutate(
      moveq = ifelse(move5 == 0 ,NA, moveq)
    )
  
  return(state_to_state)
}


bg.fx.load_migration_data_by_cbsa_with_external <- function() { 
  load("bg.cbsa.panel.external_data.Rda")
  dest.external <- data.frame(bg.external)
  colnames(dest.external) = paste0("dest.",colnames(dest.external))
  dest.external <- dest.external %>% rename(cbsa = dest.cbsa, year=dest.year) 
  
  
  ## This creates
  raw.migration.flows <- bg.fx.load_migration_data_by_cbsa()
  
  migration.flows <- raw.migration.flows %>% 
    left_join(bg.external, by=c("source.cbsa"="cbsa","year"="year")) %>%
    left_join(dest.external, by=c("dest.cbsa"="cbsa","year"="year"))
  
  migration.flows <- migration.flows %>%
    filter(!(state %in% data.bad_states) & !(dest.state %in% data.bad_states)) %>%
    filter(!is.na(year))
  
  return(migration.flows)
  
}

bg.fx.load_external <- function() {
  load("bg.cbsa.panel.external_data.matched.Rda")
  bg.external <<- bg.external
}

get_external_data <- function() { 
  bg.fx.load_external()
  return(get("bg.external",envir=.GlobalEnv))
}


get_cbsa_gdp <- function(){
  x = read.dta13("../Stata/RGDP.dta") %>%
    filter(incyear < 2013) %>%
    mutate(first.gdp = ifelse(incyear==2008, rgdp, NA),
           last.gdp = ifelse(incyear==2011,rgdp, NA)) %>%
    group_by(cbsa) %>%
    summarise(first.gdp = min(first.gdp, na.rm=na.omit),
              last.gdp = max(last.gdp, na.rm=na.omit),
              mean.gdp = mean(rgdp)
              ) %>%
      mutate(gdp.growth = first.gdp/last.gdp)
    
    return(data.frame(x))
}

get_startup_births <- function(years = c(1988:2015)) {
  
  migration.by.firms <-bg.fx.load_migration_data_by_firm(only_if_missing = TRUE) 
  
  
  cities.births <- migration.by.firms %>%
    filter(incyear %in% years) %>% 
    mutate(obs = 1) %>%
    group_by(cbsa) %>%
    summarise(city.recpi = sum(quality),
              city.DEobs = sum(obs),
              city.corporations = sum(is_corp),
              city.llcs = sum(1-is_corp))
  
  
  return (data.frame(cities.births) %>% mutate(cbsa=as.numeric(cbsa)))
  
}

get_college_towns <- function(){
  x <- read.dta13("..\\External_Data\\College Education\\college_towns.dta")
  return(x)
}



get_cbsa_names <- function(){
  x <- read.dta13("./data/cbsa_list.dta")  %>%
    select(cbsa,area) %>%
    mutate(cbsa = as.numeric(cbsa)) %>%
    rename("cbsa.name" = "area")
  
  return (x)
}


get_cbsa_pop <- function() {
  
   
  cbsa.pop = read.csv("./data/PEP_2014_GCTPEPANNR.US24PR.csv",
                      colClasses = c(rep("character",7),rep("double",7))) %>% 
                    rename(cbsa=GC.target.geo.id2, pop2010=rescen42010) %>%
                    filter(cbsa != "" & cbsa != "72") %>%
                    mutate(cbsa=as.integer(cbsa)) %>%
                    select(cbsa,pop2010)

  return(cbsa.pop)
}






